Exercise 22

Author

Alex Smilor

library(dataRetrieval)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(modeltime)
library(tsibble)
Registered S3 method overwritten by 'tsibble':
  method               from 
  as_tibble.grouped_df dplyr

Attaching package: 'tsibble'

The following object is masked from 'package:lubridate':

    interval

The following objects are masked from 'package:base':

    intersect, setdiff, union
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(forecast)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Attaching package: 'forecast'

The following object is masked from 'package:yardstick':

    accuracy
library(feasts)
Loading required package: fabletools

Attaching package: 'fabletools'

The following object is masked from 'package:yardstick':

    accuracy

The following object is masked from 'package:parsnip':

    null_model

The following objects are masked from 'package:infer':

    generate, hypothesize
library(timetk)

# Example: Cache la Poudre River at Mouth (USGS site 06752260)
poudre_dailyflow <- readNWISdv(siteNumber = "06752260",    # Download data from USGS for site 06752260
                          parameterCd = "00060",      # Parameter code 00060 = discharge in cfs)
                          startDate = "2013-01-01",   # Set the start date
                          endDate = "2023-12-31") |>  # Set the end date
  renameNWISColumns()                              # Rename columns to standard names (e.g., "Flow", "Date"
GET:https://waterservices.usgs.gov/nwis/dv/?site=06752260&format=waterml%2C1.1&ParameterCd=00060&StatCd=00003&startDT=2013-01-01&endDT=2023-12-31
daily_tbl <-  tsibble::as_tsibble(poudre_dailyflow) |> 
  as_tibble()
Using `Date` as index variable.
splits <- time_series_split(daily_tbl, assess = "12 months", cumulative = TRUE)
Using date_var: Date
training <-  training(splits)
testing  <-  testing(splits)
mods <- list(
  arima_reg() |>  set_engine("auto_arima"),
  
  prophet_reg() |> set_engine("prophet")
)

models <- map(mods, ~ fit(.x, Flow ~ Date, data = training))
frequency = 7 observations per 1 week
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
(models_tbl <- as_modeltime_table(models))
# Modeltime Table
# A tibble: 2 × 3
  .model_id .model   .model_desc           
      <int> <list>   <chr>                 
1         1 <fit[+]> ARIMA(1,1,2)(0,0,1)[7]
2         2 <fit[+]> PROPHET               
(calibration_table <- modeltime_calibrate(models_tbl, testing, quiet = FALSE))
# Modeltime Table
# A tibble: 2 × 5
  .model_id .model   .model_desc            .type .calibration_data 
      <int> <list>   <chr>                  <chr> <list>            
1         1 <fit[+]> ARIMA(1,1,2)(0,0,1)[7] Test  <tibble [365 × 4]>
2         2 <fit[+]> PROPHET                Test  <tibble [365 × 4]>
modeltime_accuracy(calibration_table) |> 
  arrange(mae)
# A tibble: 2 × 9
  .model_id .model_desc            .type   mae  mape  mase smape  rmse     rsq
      <int> <chr>                  <chr> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
1         2 PROPHET                Test   156. 330.   4.34  154.  269. 0.730  
2         1 ARIMA(1,1,2)(0,0,1)[7] Test   219.  74.5  6.08  128.  510. 0.00161
(forecast <- calibration_table  |> 
  modeltime_forecast(h = "12 months", 
                     new_data = testing,
                     actual_data = daily_tbl))
# Forecast Results
  
Conf Method: conformal_default | Conf Interval: 0.95 | Conf By ID: FALSE
(GLOBAL CONFIDENCE)
# A tibble: 4,749 × 7
   .model_id .model_desc .key   .index     .value .conf_lo .conf_hi
       <int> <chr>       <fct>  <date>      <dbl>    <dbl>    <dbl>
 1        NA ACTUAL      actual 2013-01-01   18.2       NA       NA
 2        NA ACTUAL      actual 2013-01-02   17.9       NA       NA
 3        NA ACTUAL      actual 2013-01-03   17.4       NA       NA
 4        NA ACTUAL      actual 2013-01-04   17.4       NA       NA
 5        NA ACTUAL      actual 2013-01-05   16.9       NA       NA
 6        NA ACTUAL      actual 2013-01-06   15.5       NA       NA
 7        NA ACTUAL      actual 2013-01-07   13.9       NA       NA
 8        NA ACTUAL      actual 2013-01-08   13.5       NA       NA
 9        NA ACTUAL      actual 2013-01-09   13.5       NA       NA
10        NA ACTUAL      actual 2013-01-10   17         NA       NA
# ℹ 4,739 more rows
plot_modeltime_forecast(forecast)
refit_tbl <- calibration_table |>
    modeltime_refit(data = daily_tbl)
frequency = 7 observations per 1 week
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
refit_tbl |>
    modeltime_forecast(h = "12 months", actual_data = daily_tbl) |>
    plot_modeltime_forecast()
refit_forecast <- refit_tbl |>
    modeltime_forecast(h = "12 months", actual_data = daily_tbl)
poudre_2024 <- readNWISdv(siteNumber = "06752260",    # Download data from USGS for site 06752260
                          parameterCd = "00060",      # Parameter code 00060 = discharge in cfs)
                          startDate = "2024-01-01",   # Set the start date
                          endDate = "2024-12-31") |>  # Set the end date
  renameNWISColumns() |>                              # Rename columns to standard names (e.g., "Flow", "Date")
  mutate(Date = yearmonth(Date)) |>                   # Convert daily Date values into a year-month format (e.g., "2023 Jan")
  group_by(Date) |>                                   # Group the data by the new monthly Date
  summarise(Flow = mean(Flow))                       # Calculate the average daily flow for each month
GET:https://waterservices.usgs.gov/nwis/dv/?site=06752260&format=waterml%2C1.1&ParameterCd=00060&StatCd=00003&startDT=2024-01-01&endDT=2024-12-31
forecast2 <- rename(refit_forecast, Date = .index)
forecast2 <- rename(forecast2, Flow = .value)
forecast_plot <- forecast2 %>% 
  filter(.model_desc == c("PROPHET", "ACTUAL")) %>% 
  mutate(Date = yearmonth(Date)) %>% 
  group_by(Date) %>% 
  summarise(Flow = mean(Flow))
Warning: There was 1 warning in `filter()`.
ℹ In argument: `.model_desc == c("PROPHET", "ACTUAL")`.
Caused by warning in `.model_desc == c("PROPHET", "ACTUAL")`:
! longer object length is not a multiple of shorter object length
ggplot() +
  geom_line(data = forecast_plot, aes(x=Date, y = Flow, color = "Predicted")) +
  geom_line(data = poudre_2024, aes(x=Date, y = Flow, color = "Actual")) + 
  theme_minimal() + 
  labs(
    x = "Month",
    y = "Flow (Cubic Feet per Second)",
    color = ""
  )

The prediction of the prophet model is fairly accurate during times of relatively low flow, but severely overestimates peak flow in June. 2024 had exceptionally low average flow during the month of June compared to the rest of the years included in this model, which may explain why the model overestimated so much.

poudre_merge <- poudre_2024 %>% 
  rename(Actual = Flow) %>% 
  mutate(Date2 = as.character(Date))

forecast_merge <- forecast2 %>% 
  filter(.model_desc == "PROPHET") %>% 
  mutate(Date = yearmonth(Date)) %>% 
  group_by(Date) %>% 
  summarise(Flow = mean(Flow)) %>% 
  rename(Predicted = Flow) %>% 
  mutate(Date2 = as.character(Date))

poudre_join <- inner_join(forecast_merge, poudre_merge, by = "Date2") %>% 
  select(c(Date2, Predicted, Actual)) %>% 
  rename(Date = Date2)

poudre_lm<- lm(Predicted ~ Actual, data = poudre_join)
summary(poudre_lm)

Call:
lm(formula = Predicted ~ Actual, data = poudre_join)

Residuals:
    Min      1Q  Median      3Q     Max 
-177.06  -74.31  -27.62   19.65  344.22 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -22.8991    53.0379  -0.432    0.675    
Actual        2.8684     0.3979   7.209  2.9e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 141.2 on 10 degrees of freedom
Multiple R-squared:  0.8386,    Adjusted R-squared:  0.8225 
F-statistic: 51.97 on 1 and 10 DF,  p-value: 2.896e-05

The R-squared value between the model predictions and the observed values is 0.8223, meaning that this model accounts for approximately 82% of the observed variance.

poudre_join %>% 
  ggplot(aes(x = Actual, y = Predicted)) + 
  geom_point()+
  geom_abline(linetype = 1, color = "black") +
  geom_smooth(color = "red", method = "lm", formula = (y ~ x)) +
  theme_linedraw()